# Introduction #########
#
#
# The following is an analysis of one-hundred styles of beer brewed in the United States for the executive team,
# CEO and CFO at Budweiser. Budweiser is interested in exploring the how many breweries are in the United States,
# how each beer is reported in terms of its International Bitterness Unit and Alcohol By Content and basic summary
# statistics and conclusions we are able to uncover with the beer data provided. Statistics will include handling
# missing data and explaining why it was possibly not included in the initial dataset, as well as uncover median
# and maximum (IBU and ABV) ratings by state. Conclusions will include basic summary statics on the ABV variable,
# any relationship between the IBU and ABV variables (such as dependencies, e.g. does a higher IBU result in a
# higher ABV) and finally we will look to see if we can determine general beer styles (Ales and IPAs) based on
# ABV and IBU values. Additionally, we will report on any findings that are discovered during the analysis.
#
######################
# #
# Libraries #
# #
######################
######################
library(usmap)
library(ggplot2)
library(magrittr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readr)
library(tibble)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ purrr 0.3.4 ✓ forcats 0.5.0
## ✓ dplyr 1.0.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(robustbase)
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(dplyr)
library(codebook)
library(future)
##
## Attaching package: 'future'
## The following object is masked from 'package:caret':
##
## cluster
library(fmsb)
## Registered S3 methods overwritten by 'fmsb':
## method from
## print.roc pROC
## plot.roc pROC
library(ggraph)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:future':
##
## %->%, %<-%
## The following object is masked from 'package:class':
##
## knn
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(RColorBrewer)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
######################
######################
# #
# Data #
# #
######################
########################################################
#read in brewery data
breweryDat <- read.csv("breweries.csv")
breweryDat$State <- trimws(breweryDat$State)
#datafile to organize states into census regions
regionData <- read.csv("geocodes.csv")
regionData <- regionData[,-c(1,2)]
regionData <- dplyr::rename(regionData, "FIPS"="State..FIPS.", "Region" = "Region.1", "Division" = "Division.1")
regionData$State <- trimws(regionData$State)
#Ensure structure of data is compliant
#head(breweryDat)
#read in beer data
beerDat <- read.csv("beers.csv")
#Loop to fix leading decimal places on ABV
i <- 1
count <- length(beerDat$Name)
for (i in 1:count) {
if(is.na(beerDat[i,3])){
beerDat[i,3]=0
}
if(beerDat[i,3]<1){
beerDat[i,3] <- beerDat[i,3]*100
}
}
#Ensure structure of data is compliant
#head(beerDat)
########################################################
#Instructions You can assume that your audience is the CEO and CFO of Budweiser (your client) and that they only have had one class in statistics and have indicated that you cannot take more than 7 minutes of their time. 20% of your grade will be based on the presentation.
They have hired you to answer the 7 questions listed below and beyond those general questions you may speculate / anticipate what may be of interest to them
######################
# #
# Question 1 #
# #
######################
########################################################
#How many breweries are present in each state?
########################################################
#########################################################
#Use Dplyr to group breweries by state
brewByState <- breweryDat %>%
group_by(State) %>%
dplyr::count()
brewByState
#########################################################
#########################################################
#Add breweries by state to state information dataframe
statepop$brewByState <- brewByState$n
View(brewByState)
#########################################################
#########################################################
#Fix mismatched state brewery count to state info df
statepop[1,5] <- 3
statepop[2,5] <- 7
statepop[3,5] <- 11
statepop[4,5] <- 2
statepop[8,5] <- 2
statepop[9,5] <- 1
statepop[14,5] <- 18
statepop[15,5] <- 22
statepop[16,5] <- 5
statepop[20,5] <- 9
statepop[22,5] <- 23
statepop[25,5] <- 2
statepop[26,5] <- 9
statepop[28,5] <- 5
statepop[29,5] <- 2
statepop[30,5] <- 3
statepop[32,5] <- 4
statepop[34,5] <- 19
statepop[33,5] <- 16
statepop[35,5] <- 1
statepop[45,5] <- 4
statepop[46,5] <- 10
statepop[47,5] <- 16
statepop[49,5] <- 1
statepop[50,5] <- 20
#Check data
#View(statepop)
#View(brewByState)
#########################################################
#########################################################
#Call plot functions to plot state brewery count on USmap
nationBrewPlot <- plot_usmap(data = statepop, values = "brewByState",labels=TRUE, color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma) + theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
#display plot
nationBrewPlot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, NE first
NEplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .new_england,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
NEplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, Mid Atlantic second
MAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mid_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
MAplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, East North Central third
ENCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ENCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, West North Central fourth
WNCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WNCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, South Atlantic fifth
SAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .south_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
SAplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, East South Central sixth
ESCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ESCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, West South Central seventh
WSCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WSCplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, Mountain eighth
Mplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mountain,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Mplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
#########################################################
#Break down by region, Pacific ninth
Pplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .pacific,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Pplot
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.
## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.
## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.
#########################################################
######################
# #
# Question 2 #
# #
######################
#############################################################################################################
#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)
#############################################################################################################
#############################################################################################################
#
# We merged the breweries.csv dataset with the beers.csv dataset, additionally when we imported
# the individual datasets, we also imported a dataset that allows us to associate each beer with
# its brewery's US Census Division.
#
#############################################################################################################
#Use Dplyr package to merge the two tables together
buzzbrews <- merge(breweryDat, beerDat, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE )
#Use Dplyr package to rename "Name.x" to "Brewery" and "Name.y" to "Beer"
buzzbrews <- dplyr::rename(buzzbrews, "Brewery" = "Name.x","Beer"="Name.y")
#Check the results
#View(buzzbrews)
######################
# #
# Question 3 #
# #
######################
###########################################
# Question 3 - Address the missing values in each column.
#
# During the initial exploratory process we discovered NA's in both the IBU and ABV columns.
# Upon further investigation we determined that some styles of beer, mixed or barrel aged beers
# do not have an ABV available at the time the brewery submits packaging labels to TTB, or Alcohol
# and Tobacco Tax and Trade Bureau. The TTB is the federal agency that determines what can and cannot
# be put on a beer label including the art, type size, verbiage, where elements are placed and etc.
# So beers without an AVB available either do not inlcude it, or add it to the bottom of the cans
# or packaging at a later date.
#
# In terms of the missing IBU values, we determined that even though the IBU alludes to the bitterness
# of a beer's taste, it is somewhat misleading because it is derived from a test that measures different
# chemical compounds that are known to cause bitter flavoring. For instance, a beer may have a high IBU
# value, but due to other ingredients, such as added lactose or sucrose may actually have a sweeter taste
# than would be expected from a high IBU. The other comfounding variable is if the brewery can afford the
# equipment used to generate an IBU value, smaller breweries simply cannot afford it while the larger
# breweries typically just use IBU as a quality control measure.
#
# Finally, we concluded that imputing data or filling in the missing gaps was a good idea for this
# analysis and that was done by taking an average of from similiar styles of beer and assigning that to
# beers in the same sytle classification that did not have values. Upon random testing of different imputed
# values, by googling beers that had missing values in the dataset and comparing that to the created averages,
# it was determined that the imputed values were very close to the actual values in the marketplace.
#
###########################################
#Loop to fix numbering for Column 1 "brew ID"
iterations <- length(buzzbrews$Brew_ID)
for (i in 1:iterations) {
buzzbrews[i,1]=i
}
#Fix no style beers to none
levels(buzzbrews$Style) <- c(levels(buzzbrews$Style), "none")
for (i in 1:iterations) {
if(is.na(buzzbrews[i,9])){
}
}
for (i in 1:iterations) {
if((buzzbrews[i,9])==''){
#print(buzzbrews[i,9])
buzzbrews[i,9]="none"
}
}
#Prep new df to contain style and averages
buzzbrews$Style <- as.factor(buzzbrews$Style)
#Create a data frame with each style and a variable for average IBU
styleCount <- as.data.frame(levels(buzzbrews$Style))
styleCount$`levels(buzzbrews$Style)` <- as.character(styleCount$`levels(buzzbrews$Style)`)
#View(styleCount)
#Initialize mean ibu to zero (to avoid problems with N/As)
styleCount$meanIbu <- 0
#Make beer count to keep track of total in each style
styleCount$beerCount <- 0
#Make column for total ibus
styleCount$totalIBU <- 0
styleCount$meanABV <- 0
styleCount$ABVbeerCount <- 0
styleCount$totalABV <- 0
#Checking
#View(styleCount)
#styleCount <- styleCount[-c(1), ]
#View(styleCount)
#Calculate mean IBU for each category and store it in IBU df
#Calculate average IBU for each style and add it to df
#outer loop for all the beers
ibuSum <- 0
beerCount <- 0
i <- 1
for (i in 1:iterations) {
if(is.na(buzzbrews[i,8])) {
buzzbrews[i,8]=0
}
#inner for each style
for (j in 1:100) {
if(buzzbrews[i,9]==styleCount[j,1]){
#Compute IBU sum
styleCount[j,4] <- styleCount[j,4]+buzzbrews[i,8]
#Total of each beer count
styleCount[j,3] <- styleCount[j,3]+1
if(buzzbrews[i,8]==0){
styleCount[j,3] <- styleCount[j,3]-1
}
}
#Mean IBU for each style
styleCount[j,2] <- styleCount[j,4]/styleCount[j,3]
}}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
if(buzzbrews[i,8]==0){
for(j in 1:100){
if(buzzbrews[i,9]==styleCount[j,1]){
buzzbrews[i,8]=styleCount[j,2]
}
}
}
}
# View(styleCount)
# View(buzzbrews)
# Now do it all again for ABV
# Calculate average ABV for each style and add it to df
# outer loop for all the beers
AlcSum <- 0
AlcVeerCount <- 0
i <- 1
for (i in 1:iterations) {
if(is.na(buzzbrews[i,7])) {
buzzbrews[i,7]=0
}
#inner for each style
for (j in 1:100) {
if(buzzbrews[i,9]==styleCount[j,1]){
#Compute ALC sum
styleCount[j,7] <- styleCount[j,7]+buzzbrews[i,7]*100
#Total of each beer count
styleCount[j,6] <- styleCount[j,6]+1
if(buzzbrews[i,7]==0){
styleCount[j,6] <- styleCount[j,6]-1
}
}
#Mean ABV for each style
styleCount[j,5] <- (styleCount[j,7]/styleCount[j,6])/100
}
}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
if(buzzbrews[i,7]==0){
for(j in 1:100){
if(buzzbrews[i,9]==styleCount[j,1]){
buzzbrews[i,7]=styleCount[j,5]
}
}
}
}
#kill NaN's for other alcohol types with no hops
i <- 1
for(i in 1:iterations){
if(is.na(buzzbrews[i,8])){
buzzbrews[i,8] <- 0
}
}
#Check out end results - look for any leftover NAs in DF
sapply(buzzbrews, function(x) sum(is.na(x)))
## Brew_ID Brewery City State Beer Beer_ID ABV IBU Style Ounces
## 0 0 0 0 0 0 0 0 0 0
# Add in the regional data from the census bureau
buzzbrews <- merge(buzzbrews, regionData, by = "State")
# View new dataframe
view(buzzbrews)
######################
# #
# Question 4 #
# #
######################
#############################################################################################################
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare
#############################################################################################################
# We computed the MedStateABV and IBU for each state and created a visualisation that allowed
# us to further explore what those medians tell us. We found there appears to be a relationship
# between IBU and ABV where we can use IBU to estimate ABV of a given beer.
#
# We explored this further by developing a model to make predictions based on historical IBU
# and ABV data and were able to predict that a beer with 32 IBU could have an ABV of 5.72% and
# we were 97.5% confident that beer would at least fall between 3.24% and 8.21%.
#
#################################################################################################################
buzzbrews$State <- trimws(buzzbrews$State)
# Group by state and compute
combineddf <- buzzbrews %>%
group_by(State) %>%
dplyr::summarise(MedStateIBU = median(IBU), MedStateABV = median(ABV))
## `summarise()` ungrouping output (override with `.groups` argument)
combineddf <- as.data.frame(combineddf)
combineddf$MedStateIBU <- as.numeric(combineddf$MedStateIBU)
combineddf$MedStateABV <- as.numeric(combineddf$MedStateABV)
# Divisional measurements
divisiondf <- buzzbrews %>%
group_by(Division) %>%
dplyr::summarise(MedDivIBU = median(IBU), MedDivABV = median(ABV))
## `summarise()` ungrouping output (override with `.groups` argument)
# round values to xx.x ###
divisiondf$MedDivIBU <- round(divisiondf$MedDivIBU, digits = 1)
divisiondf$MedDivABV <- round(divisiondf$MedDivABV, digits = 1)
combineddf$MedStateIBU <- round(combineddf$MedStateIBU, digits = 1)
combineddf$MedStateABV <- round(combineddf$MedStateABV, digits = 1)
# Add regions to combinddf
combineddf <- merge(combineddf,regionData,by="State")
# Add in divisional values
combineddf <- merge(combineddf, divisiondf, by = "Division")
####### Create chart labels for stacked charts #####
combineddf$ABVlabel <- paste(combineddf$State, combineddf$MedStateABV)
combineddf$IBUlabel <- paste(combineddf$State, combineddf$MedStateIBU)
view(combineddf)
# Create sums of medians for labeling charts #
StateSums <- combineddf %>%
group_by(Division) %>%
dplyr::summarise(SumStateABV = sum(MedStateABV), SumStateIBU = sum(MedStateIBU))
## `summarise()` ungrouping output (override with `.groups` argument)
combineddf <- merge(combineddf, StateSums, by = "Division")
#
############################################################
######### ################
######### Draw Bar Charts ################
######### ################
############################################################
#
combineddf %>%
ggplot(aes(x=Division, y=MedStateABV,fill= reorder(State,-MedStateABV))) +
# Create stacked by chart organized by Division with States stacked in each bar
geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) +
# Add state and ABV value to each state's chart position
geom_text(aes(label = ABVlabel), size = 3, position = position_stack(vjust = 0.5)) +
# Add Division ABV Values to top of each chart stack
geom_text(aes(Division, MedDivABV + SumStateABV -3, label = MedDivABV), size = 3, vjust = 1, fontface = "italic") +
# Label the chart objects
labs(title="Median ABV by State by US Census Division in the USA",
subtitle="Budweiser Consultation",
caption="source: ABV. ABV imputed where necessary.",
y = "Alcohol By Volume",
x = "States by US Census Divisions ") +
theme_classic() +
# Remove y-labels and ticks since this is a stacked chart
theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
# Wrap X-axis labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
#
##### Create bar plot for IBU #####
#
combineddf %>%
ggplot(aes(x=Division, y=MedStateIBU,fill= reorder(State,-MedStateIBU))) +
# Create stacked by chart organized by Division with States stacked in each bar
geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) +
# Add state and IBU value to each state's chart position
geom_text(aes(label = IBUlabel), size = 3, position = position_stack(vjust = 0.5)) +
# Add Division IBU Values to top of each chart stack
geom_text(aes(Division, MedDivIBU + SumStateIBU - 15, label = MedDivIBU), size = 3, vjust = 1, fontface = "italic") +
# Label the chart objects
labs(title="Median IBU by State by US Census Division in the USA",
subtitle="Budweiser Consultation",
caption="source: IBU. IBU imputed where necessary.",
y = "Median Int'l Bitterness Unit",
x = "States by US Census Divisions ") +
theme_classic() +
# Remove y-labels and ticks since this is a stacked chart
theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
# Wrap X-axis labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
######################
# #
# Question 5 #
# #
######################
#############################################################################################################
# Question 5 - Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
#
# We determined that the maximum observed IBU was 138 in Oregon for Bitter Bitch Imperial IPA that
# is an American Double/ Imperial IPA from the Astoria Brewing Company in Austoria, OR.
#
# We also determined that maximum observed ABV was 12.8% in Colorado for Lee Hill Series Vol. 5 –
# Belgian Style Quadrupel Ale from Upslope Brewing Company in Boulder, CO.
#
#############################################################################################################
#Figure out which has highest ABV
MaxStateABV <- arrange(buzzbrews, desc(ABV))
print(MaxStateABV[1,4])
## [1] "Boulder"
#Figure out which has highest IBU
maxIBU <- arrange(buzzbrews,desc(IBU))
print(maxIBU[1,4])
## [1] "Astoria"
##### Question 5 Answer #####
## Colorado has the highest ABV = 12.8, Oregon has the highest IBU = 138.
###################################################
###### Create DF for just the max ABV & IBU values ######
# State measurements
maxStateValues <- buzzbrews %>%
group_by(State) %>%
dplyr::count(MaxStateABV = max(ABV), MaxStateIBU = max(IBU))
maxStateValues <- maxStateValues[,-4]
maxStateValues <- as.data.frame(maxStateValues)
maxStateValues$State <- trimws(maxStateValues$State)
str(maxStateValues)
## 'data.frame': 51 obs. of 3 variables:
## $ State : chr "AK" "AL" "AR" "AZ" ...
## $ MaxStateABV: num 6.8 9.3 6.1 9.5 9.9 ...
## $ MaxStateIBU: num 71 103 45.7 99 115 ...
view(maxStateValues)
# Divisional measurements
divMaxValdf <- buzzbrews %>%
group_by(Division) %>%
dplyr::count(MaxDivABV = max(ABV), MaxDivIBU = max(IBU))
divMaxValdf <- divMaxValdf[,-4]
divMaxValdf <- as.data.frame(divMaxValdf)
# round values to xx.x ###
maxStateValues$MaxStateABV <- round(maxStateValues$MaxStateABV, digits = 1)
maxStateValues$MaxStateIBU <- round(maxStateValues$MaxStateIBU, digits = 1)
# Add regions to maxStateValues
maxStateValues <- merge(maxStateValues,regionData,by="State")
# Add in divisional values
maxStateValues <- merge(maxStateValues, divMaxValdf, by = "Division")
####### Create chart labels for stacked charts #####
maxStateValues$ABVmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateABV)
maxStateValues$IBUmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateIBU)
view(maxStateValues)
# Create sums of max values for labeling charts #
StateMaxSums <- maxStateValues %>%
group_by(Division) %>%
dplyr::summarise(SumStateABV = sum(MaxStateABV), SumStateIBU = sum(MaxStateIBU))
## `summarise()` ungrouping output (override with `.groups` argument)
maxStateValues <- merge(maxStateValues, StateMaxSums, by = "Division")
###################################################
###### Plot for Max ABV ###########################
maxStateValues %>%
ggplot(aes(x=Division, y=MaxStateABV,fill= reorder(State,-MaxStateABV))) +
# Create stacked by chart organized by Division with States stacked in each bar
geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7,
position = position_stack(), show.legend = FALSE) +
# Add state and ABV value to each state's chart position
geom_text(aes(label = ABVmaxLabel), size = 3, position = position_stack(vjust = 0.5)) +
# Add Division ABV Values to top of each chart stack
geom_text(aes(Division, MaxDivABV + SumStateABV, label = MaxDivABV), size = 3,
nudge_y = -7, fontface = "italic") +
# Label the chart objects
labs(title="Max ABV by State by US Census Division in the USA",
subtitle="Budweiser Consultation",
caption="source: ABV. ABV imputed where necessary.",
y = "Alcohol By Volume",
x = "States by US Census Divisions ") +
theme_classic() +
# Remove y-labels and ticks since this is a stacked chart
theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
# Wrap X-axis labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
###################################################
############### Chart Max IBU #####################
maxStateValues %>%
ggplot(aes(x=Division, y=MaxStateIBU,fill= reorder(State,-MaxStateIBU))) +
# Create stacked by chart organized by Division with States stacked in each bar
geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7,
position = position_stack(), show.legend = FALSE) +
# Add state and ABV value to each state's chart position
geom_text(aes(label = IBUmaxLabel), size = 3, position = position_stack(vjust = 0.5)) +
# Add Division ABV Values to top of each chart stack
geom_text(aes(Division, MaxDivIBU + SumStateIBU, label = MaxDivIBU),
size = 3, nudge_y = -75, fontface = "italic") +
# Label the chart objects
labs(title="Max IBU by State by US Census Division in the USA",
subtitle="Budweiser Consultation",
caption="source: IBU imputed where necessary.",
y = "Alcohol By Volume",
x = "States by US Census Divisions ") +
theme_classic() +
# Remove y-labels and ticks since this is a stacked chart
theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
# Wrap X-axis labels
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
######################
# #
# Question 6 #
# #
######################
#############################################################################################################
# Question 6 - Comment on the summary statistics and distribution of the ABV variable.
#
# We observed summary statistics from the ABV data showing that once we filled in the missing
# values as best as we could, there was a range of 0.10% to 12.80% with a median of 5.65% (median
# is simply the middle value if we were to arrange all the ABVs in either descending or ascending order).
#
# We also found a very common range within the overall range that went from 5.0% ABV to 6.70% ABV and
# upon further review noticed this is where many commonly mass produced beers fall, for example: Bud
# Ice (5.5%), Bud Light Platinum (6%), Natural Ice (5.9%), Bud Ice (5.5%), Budweiser (5%), Blue Moon
# (5%), Stella Artois (5%), Heinekin (5%), Pabst Blue Ribbon (4.74%) and Miller Genuinine Draft (4.6%).
#
#############################################################################################################
# Check on the distribution of ABV
# Add histogram of ABV distribution
buzzbrews %>%
ggplot(aes(x = ABV)) +
geom_histogram(binwidth = 1, color = "#ffffff", fill = "#c8102e") +
# Adjust the scale to be able to mark-up the chart later in PowerPoint to match the
# summary statistics - 1Q, middle 50%, 4Q
scale_x_continuous(breaks = seq(0, 14, by = 1)) +
stat_bin(binwidth = 1, aes(label = ..count..), vjust = -0.5, geom = "text", size = 3) +
# Adjust titles
labs(title = "Histogram of ABV Distribution",
subtitle = "Busweiser Consultation",
x = "ABV Values",
y = "Number of Beers",
caption = "source: ABV imputed where necessary.") +
theme_classic()
ABVsummary <- summary(buzzbrews$ABV)
ABVsummary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 5.000 5.650 5.975 6.700 12.800
######################
# #
# Question 7 #
# #
######################
#############################################################################################################
# Question 7 - Is there an apparent relationship between the bitterness of the beer and its
# alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and
# EXPLAIN your answer.
#
# We used a scatter plot to viusally explore if there is any sort of relationship between
# IBU and ABV, in other words can IBU determine ABV or can ABV be used to determine IBU.
# There was evidence of a positive relationship, but and we will discuss this further shortly,
# it appears one can potentially predict the other.
#
#############################################################################################################
# Label Ales, IPAs and neither - this is also setting up for the next question,
# but we want to be able to utilize it here
buzzbrews$IPAAle = case_when(grepl("\\bIPA\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
grepl("\\bindia pale ale\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
grepl("\\bale\\b", buzzbrews$Beer, ignore.case = TRUE ) ~ "Ale",
TRUE ~ "Neither")
#view(buzzbrews)
## Calculate slope and intercept of line of best fit ##
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews))
comparisonCoef
## (Intercept) IBU
## 4.71799073 0.03142639
# (Intercept) MaxIBU
# 4.71799073 0.03142639
# Remove this chart if we decide to use the one labeled with IPA, Ale, Neither
buzzbrews %>%
ggplot(aes(x = IBU, y = ABV, color = "#c8102e")) +
geom_point(show.legend = FALSE, na.rm = TRUE) +
geom_abline(intercept = comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
theme_classic() +
labs(title = "IBU vs ABV",
subtitle = "Budweiser Consultation",
y = "Alcohol By Volume",
x = "Int'l Bitterness Unit",
caption="ABV and IBU values imputed where necessary.")
buzzbrews %>%
ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
geom_point(show.legend = TRUE, na.rm = TRUE) +
geom_abline(intercept = comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
theme_classic() +
labs(title = "IBU vs ABV",
subtitle = "Budweiser Consultation",
y = "Alcohol By Volume",
x = "Int'l Bitterness Unit",
caption="ABV and IBU values imputed where necessary.") +
scale_color_manual(values = c("#c8102e","#13294b","#b1b3b3"),
name = "Type of Beer",
breaks = c("Ale", "IPA", "Neither"),
labels = c("Ale", "IPA", "Neither"))
######################
# #
# Question 8 #
# #
######################
#############################################################################################################
# Question 8 - . Budweiser would also like to investigate the difference with respect to IBU and ABV
# between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).
# You decide to use KNN classification to investigate this relationship. Provide statistical evidence one
# way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy
# to understand conceptually.
# In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may
# also feel free to supplement your response to this question with any other methods or techniques you have
# learned. Creativity and alternative solutions are always encouraged.
#
# Response:
# We built a kNN (nearest neighbor) classifier to see if there is a difference between IPA and Ale, and while
# we were at it, we also added in a third class called, "Neither." In building the kNN, we wanted to explore
# what the appropriate number of "neighbors" was to compare to since there is so many observations so close
# together (think New York city and all the noise generated). We found that generally 8 neighbors were the
# best estimation (we randomly parsed the data 100 times to find the best neighbors value).
#
# Our classifier was accurate in determining if a beer was an Ale, IPA or Neither between 59% and 67% of the time (maybe 63%-64% to be more precise) when we used 8 nearest neighbors.
# Next we created some random pairings of IBU and ABV to see how the classifier handled the data and discovered
# it again was about 64.5% accurate. It is far more accurate identify Neither style of beer 78% of the time,
# then IPAs 67.5% of the time and Ale's 26% of the time.
# We also look a look at the ranges for IBU and ABV for each of the 3 broad types of beers IPA, Ale or
# "Neither" and found the following results, showing that it should be more difficult to predict between
# the 3 different types of beers.
#
# IPAAle ABV.min ABV.med ABV.max IBU.min IBU.med IBU.max
# Ale 3.5 5.4 12.8 7 31 120
# IPA 4 6.7 9.9 19 67.6 138
# neither 0.1 5.5 12.5 0 28 130
#
# Additionally we re-visualized the plot chart with a regression line from the previous question, this
# time showing the plots colored based on the classification of Ale, IPA or neither.
#
#############################################################################################################
### For all IPA/ Ale/ Neither ###################################
##### Find the best value of K and train the model ##############
iterations = 100
numks = 25
splitPerc = .70
set.seed(33)
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(buzzbrews)[1],round(splitPerc * dim(buzzbrews)[1]))
train = buzzbrews[trainIndices,]
test = buzzbrews[-trainIndices,]
for(i in 1:numks)
{
classifications = class::knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = FALSE, k = i)
table(classifications,test$IPAAle)
CM = confusionMatrix(table(classifications,test$IPAAle))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
# Visually find the best value of k by using it's location in the dataframe based on the highest Mean value
plot(seq(1,numks,1),MeanAcc, type = "l",
col = "#c8201e",
main = "Value for K Neighbors vs Accuracy",
sub = "Budweiser Consultation",
xlab = "Value of K Neighbors",
ylab = "Accuracy Rate (Percentage)")
# Locate the value of k based on the best MeanAcc in the dataframe
kvalue = match(max(MeanAcc), MeanAcc)
max(MeanAcc)
## [1] 0.6544813
kvalue
## [1] 9
####### Best value of k = 8 between 59% - 67% Accuracy #####################
####### Train the model using k = 8 #####################
classifications = class::knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = kvalue, use.all = TRUE)
table(classifications,test$IPAAle)
##
## classifications Ale IPA Neither
## Ale 67 12 49
## IPA 9 64 55
## Neither 112 30 325
CM = confusionMatrix(table(classifications,test$IPAAle))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA Neither
## Ale 67 12 49
## IPA 9 64 55
## Neither 112 30 325
##
## Overall Statistics
##
## Accuracy : 0.6307
## 95% CI : (0.5944, 0.666)
## No Information Rate : 0.5934
## P-Value [Acc > NIR] : 0.02199
##
## Kappa : 0.3221
##
## Mcnemar's Test P-Value : 4.24e-07
##
## Statistics by Class:
##
## Class: Ale Class: IPA Class: Neither
## Sensitivity 0.35638 0.60377 0.7576
## Specificity 0.88598 0.89627 0.5170
## Pos Pred Value 0.52344 0.50000 0.6959
## Neg Pred Value 0.79664 0.92941 0.5938
## Prevalence 0.26003 0.14661 0.5934
## Detection Rate 0.09267 0.08852 0.4495
## Detection Prevalence 0.17704 0.17704 0.6459
## Balanced Accuracy 0.62118 0.75002 0.6373
####### Test the Classifier with some random data ###
classifyMyBeers <- data.frame(ABV = c(6,6,5,4,5, 12, 7),
IBU = c(78, 65, 55, 38, 100, 148, 98))
classifications = class::knn(train[,c(7,8)],classifyMyBeers,train$IPAAle, prob = TRUE, k = kvalue)
classifications
## [1] Neither Ale Ale Neither IPA Neither IPA
## attr(,"prob")
## [1] 0.6666667 0.6666667 0.5000000 0.4444444 0.7777778 0.6666667 0.8181818
## Levels: Ale IPA Neither
############ Test Results after one Pass #######################
#Class: neither Ale Ale neither IPA neither IPA
#Prob: 0.6250000 0.6250000 0.6250000 0.7500000 0.7500000 0.5000000 0.7777778
##################################################
############# Summary data by classification #############
IPAAleSummary <- buzzbrews %>%
group_by(IPAAle) %>%
dplyr::summarise(ABV.min = min(ABV),
ABV.med = median(ABV),
ABV.max = max(ABV),
IBU.min = min(IBU),
IBU.med = median(IBU),
IBU.max = max(IBU))
## `summarise()` ungrouping output (override with `.groups` argument)
#view(IPAAleSummary)
write.csv(IPAAleSummary, file = "IPA_Ale_Summary.csv")
########################################################
##### Replot and color by beer style ###################
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews))
comparisonCoef
## (Intercept) IBU
## 4.71799073 0.03142639
# (Intercept) MaxIBU
# 4.71799073 0.03142639
buzzbrews %>%
ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
geom_point(show.legend = TRUE, na.rm = TRUE) +
geom_abline(intercept = comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
theme_classic() +
labs(title = "IBU vs ABV",
subtitle = "Budweiser Consultation",
y = "Alcohol By Volume",
x = "Int'l Bitterness Unit",
caption="ABV and IBU values imputed where necessary.") +
scale_color_manual(values = c("#c8102e","#13294b","#b1b3b3"),
name = "Type of Beer",
breaks = c("Ale", "IPA", "Neither"),
labels = c("Ale", "IPA", "Neither"))
#################################################################
#################################################################
### ###############
### For only IPA $ Ale ###############
##### Find the best value of K and train the model###############
##### ###############
#################################################################
# Create new DF so not to confuse with buzzbrews
# Filter Ales and IPAs only
buzzKNN <- dplyr::filter(buzzbrews, IPAAle == "IPA" | IPAAle == "Ale")
iterations = 100
numks = 25
splitPerc = .70
set.seed(33)
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(buzzKNN)[1],round(splitPerc * dim(buzzKNN)[1]))
train = buzzKNN[trainIndices,]
test = buzzKNN[-trainIndices,]
for(i in 1:numks)
{
classifications = class::knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = i)
table(classifications,test$IPAAle)
CM = confusionMatrix(table(classifications,test$IPAAle))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
# Visually find the best value of k by using it's location in the dataframe based on the highest Mean value
plot(seq(1,numks,1),MeanAcc, type = "l",
col = "#c8201e",
main = "Value for K Neighbors vs Accuracy",
sub = "Budweiser Consultation",
xlab = "Value of K Neighbors",
ylab = "Accuracy Rate (Percentage)")
# Locate the value of k based on the best MeanAcc in the dataframe
kvalue = match(max(MeanAcc), MeanAcc)
max(MeanAcc)
## [1] 0.8863958
kvalue
## [1] 3
####### Best value of k = 8 between 59% - 67% Accuracy #####################
####### Train the model using k = 8 #####################
####### Test the Classifier with some random data ###
classifyMyBeers <- data.frame(ABV = c(6,6,5,4,5, 12, 7),
IBU = c(78, 65, 55, 38, 100, 148, 98))
classifications = class::knn(train[,c(7,8)],classifyMyBeers,train$IPAAle, prob = TRUE, k = kvalue)
classifications
## [1] IPA IPA Ale Ale IPA IPA IPA
## attr(,"prob")
## [1] 1.000 1.000 0.875 0.750 1.000 1.000 1.000
## Levels: Ale IPA
############ Test Results after one Pass #######################
#Class: neither Ale Ale neither IPA neither IPA
#Prob: 0.6250000 0.6250000 0.6250000 0.7500000 0.7500000 0.5000000 0.7777778
##################################################
############# Summary data by classification #############
buzzbrews2 <- buzzbrews
IPAAleSummary <- buzzbrews2 %>%
group_by(IPAAle) %>%
dplyr::summarise(ABV.min = min(ABV),
ABV.med = median(ABV),
ABV.max = max(ABV),
IBU.min = min(IBU),
IBU.med = median(IBU),
IBU.max = max(IBU))
## `summarise()` ungrouping output (override with `.groups` argument)
view(IPAAleSummary)
IPAAleSummary
########################################################
##### Replot and color by beer style ###################
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews2))
comparisonCoef
## (Intercept) IBU
## 4.71799073 0.03142639
# (Intercept) MaxIBU
# 4.71799073 0.03142639
buzzbrews2 %>%
ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
geom_point(show.legend = TRUE, na.rm = TRUE) +
geom_abline(intercept = comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
theme_classic() +
labs(title = "IBU vs ABV",
subtitle = "Budweiser Consultation",
y = "Alcohol By Volume",
x = "Int'l Bitterness Unit",
caption="ABV and IBU values imputed where necessary.") +
scale_color_manual(values = c("#c8102e","#13294b","#b1b3b3"),
name = "Type of Beer",
breaks = c("Ale", "IPA", "Neither"),
labels = c("Ale", "IPA", "Neither"))
#############################################################################################################################################
#Alternative technique for classifcation
#Hypothesis: Naive Bayes is a stronger ML technique for categorical data
#We implemented a Naive Bayes classifer based on IBU and ABV as a predictor of categotization of style of beer (IPA,Ale, or Neither)
#############################################################################################################################################
#Naive Bayes Summary
#In summary the classifer correctly selects IPA about 61% of the time, all other ales around 63% of the time, and handles "other" beers about 84% of the time.
#These are consistent with what we can expect from a classifer, and this model in particular does especially well at identifying beers that aren't in the specified categories, this is unsurpirsing as many beers do not fall into this category.
#This classifer complements the KNN classifier as KNN is strong at identifying similar beers, and the Bayesian classifier better handles
#non-grouped beers
#############################################################################################################################################
#Create new DF for Naive Bayes classifier (Don't want to interfere with original buzzbrews DF)
bayesDat <- dplyr::filter(buzzbrews, IPAAle == "IPA" | IPAAle == "Ale")
#Make the classifier happy and convert outcome to factor
bayesDat$IPAAle <- as.factor(bayesDat$IPAAle)
#Run this loop to run classifier 100 times to determine mean accuracy
iterations = 100
masterAcc = matrix(nrow = iterations,ncol=3)
#Begin the loop
for(j in 1:iterations)
{
#change seed each iteration
set.seed(j)
#Determine training and testing indicies
trainIndices = sample(seq(1:length(bayesDat$Beer)),round(.8*length(bayesDat$Beer)))
trainBeer = bayesDat[trainIndices,]
testBeer = bayesDat[-trainIndices,]
#Generate model, table, and confusion matrix
model = naiveBayes(trainBeer[,c(7,8)],trainBeer$IPAAle)
table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle)
CM = confusionMatrix(table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle))
#Insert current accuracies
masterAcc[j,1] = CM$overall[1]
masterAcc[j,2] = CM$byClass[1]
masterAcc[j,3] = CM$byClass[2]
}
#Mean accuracy
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.8710053 0.8941920 0.8351311
#Confusion matrix
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 102 13
## IPA 11 63
##
## Accuracy : 0.873
## 95% CI : (0.817, 0.9169)
## No Information Rate : 0.5979
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7348
##
## Mcnemar's Test P-Value : 0.8383
##
## Sensitivity : 0.9027
## Specificity : 0.8289
## Pos Pred Value : 0.8870
## Neg Pred Value : 0.8514
## Prevalence : 0.5979
## Detection Rate : 0.5397
## Detection Prevalence : 0.6085
## Balanced Accuracy : 0.8658
##
## 'Positive' Class : Ale
##
#############################################################################################################################################
#############################################################################################################################################
#Naive Bayes Summary
#In summary the classifer correctly selects IPA about 61% of the time, all other ales around 63% of the time, and handles "other" beers about 84% of the time.
#These are consistent with what we can expect from a classifier, and this model in particular does especially well at identifying beers that aren't in the specified categories, this is unsurprising as many beers do not fall into this category.
#This classifer complements the KNN classifier as KNN is strong at identifying similar beers, and the Bayesian classifier better handles
#non-grouped beers
#############################################################################################################################################
######################
# #
# Question 9 #
# #
######################
#############################################################################################################
#Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in.
#You must convince them why it is important and back up your conviction with appropriate statistical evidence.
#############################################################################################################
#Create new df for graph data
q9DF <- buzzbrews
#Create pairs plot for obvious evidence of relationships
scatterplotMatrix(~Style+IBU+ABV+Ounces, data=q9DF , col="#c8102e", cex=0.5 ,
pch=c(15,16,17) ,
main="Scatter plots for visual evidence of relationships"
)
#Closer examination of relationship between style and ibu
ggplot(q9DF, aes(x=Style, y=IBU, color=State)) + geom_point(size=2) +
theme_classic() +
labs(title = "IBU vs Style",
subtitle = "Budweiser Consultation",
x = "Beer Style (100 differnt styles)",
y = "Int'l Bitterness Unit (where available)",
caption="ABV and IBU values imputed where necessary.")+theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5))
#Do some anova
ibustyleLM <- lm(IBU~Style,data = q9DF)
summary(ibustyleLM)
##
## Call:
## lm(formula = IBU ~ Style, data = q9DF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.320 -2.593 0.000 0.407 78.701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.200e+01 7.182e+00 3.063 0.002215
## StyleAltbier 1.212e+01 7.715e+00 1.572 0.116161
## StyleAmerican Adjunct Lager -1.100e+01 7.570e+00 -1.453 0.146355
## StyleAmerican Amber / Red Ale 1.430e+01 7.236e+00 1.976 0.048261
## StyleAmerican Amber / Red Lager 1.250e+00 7.425e+00 0.168 0.866331
## StyleAmerican Barleywine 7.400e+01 9.272e+00 7.981 2.26e-15
## StyleAmerican Black Ale 4.690e+01 7.379e+00 6.356 2.49e-10
## StyleAmerican Blonde Ale -1.016e+00 7.248e+00 -0.140 0.888492
## StyleAmerican Brown Ale 7.895e+00 7.284e+00 1.084 0.278536
## StyleAmerican Dark Wheat Ale 5.600e+00 8.144e+00 0.688 0.491737
## StyleAmerican Double / Imperial IPA 7.132e+01 7.250e+00 9.837 < 2e-16
## StyleAmerican Double / Imperial Pilsner 6.300e+01 1.016e+01 6.203 6.56e-10
## StyleAmerican Double / Imperial Stout 4.000e+01 7.940e+00 5.038 5.07e-07
## StyleAmerican India Pale Lager 4.133e+01 9.272e+00 4.458 8.67e-06
## StyleAmerican IPA 4.563e+01 7.199e+00 6.339 2.77e-10
## StyleAmerican Malt Liquor -2.200e+01 1.244e+01 -1.769 0.077100
## StyleAmerican Pale Ale (APA) 2.294e+01 7.211e+00 3.181 0.001486
## StyleAmerican Pale Lager 4.750e+00 7.364e+00 0.645 0.518961
## StyleAmerican Pale Wheat Ale -1.311e+00 7.256e+00 -0.181 0.856577
## StyleAmerican Pilsner 1.286e+00 7.464e+00 0.172 0.863247
## StyleAmerican Porter 9.923e+00 7.287e+00 1.362 0.173399
## StyleAmerican Stout 1.931e+01 7.364e+00 2.623 0.008783
## StyleAmerican Strong Ale 4.342e+01 7.678e+00 5.655 1.75e-08
## StyleAmerican White IPA 2.683e+01 7.808e+00 3.437 0.000599
## StyleAmerican Wild Ale 5.000e+00 8.293e+00 0.603 0.546623
## StyleBaltic Porter 3.200e+01 8.293e+00 3.859 0.000117
## StyleBelgian Dark Ale 7.667e+00 7.808e+00 0.982 0.326229
## StyleBelgian IPA 3.500e+01 7.570e+00 4.623 3.99e-06
## StyleBelgian Pale Ale 1.875e+00 7.475e+00 0.251 0.801969
## StyleBelgian Strong Dark Ale 5.000e+01 8.293e+00 6.029 1.91e-09
## StyleBelgian Strong Pale Ale 3.000e+00 8.144e+00 0.368 0.712618
## StyleBerliner Weissbier -1.420e+01 7.808e+00 -1.819 0.069081
## StyleBière de Garde 1.000e+00 8.144e+00 0.123 0.902279
## StyleBock 3.000e+00 8.144e+00 0.368 0.712618
## StyleBraggot -2.200e+01 1.244e+01 -1.769 0.077100
## StyleCalifornia Common / Steam Beer 1.900e+01 8.293e+00 2.291 0.022048
## StyleChile Beer 2.000e+00 9.272e+00 0.216 0.829236
## StyleCider -2.200e+01 7.374e+00 -2.984 0.002878
## StyleCream Ale -1.667e+00 7.425e+00 -0.224 0.822425
## StyleCzech Pilsener 1.181e+01 7.434e+00 1.589 0.112203
## StyleDoppelbock 2.500e+00 8.144e+00 0.307 0.758879
## StyleDortmunder / Export Lager 1.800e+00 8.293e+00 0.217 0.828189
## StyleDubbel -1.000e+00 8.498e+00 -0.118 0.906334
## StyleDunkelweizen -6.000e+00 8.796e+00 -0.682 0.495230
## StyleEnglish Barleywine 4.467e+01 9.272e+00 4.817 1.55e-06
## StyleEnglish Bitter 8.667e+00 9.272e+00 0.935 0.350027
## StyleEnglish Brown Ale 1.200e+00 7.570e+00 0.159 0.874068
## StyleEnglish Dark Mild Ale -5.484e-13 8.293e+00 0.000 1.000000
## StyleEnglish India Pale Ale (IPA) 3.271e+01 7.715e+00 4.241 2.32e-05
## StyleEnglish Pale Ale 9.400e+00 7.757e+00 1.212 0.225734
## StyleEnglish Pale Mild Ale -6.000e+00 9.272e+00 -0.647 0.517620
## StyleEnglish Stout 4.400e+01 1.016e+01 4.332 1.54e-05
## StyleEnglish Strong Ale 3.200e+01 8.796e+00 3.638 0.000281
## StyleEuro Dark Lager 5.000e+00 8.498e+00 0.588 0.556330
## StyleEuro Pale Lager -3.332e-13 1.016e+01 0.000 1.000000
## StyleExtra Special / Strong Bitter (ESB) 2.371e+01 7.533e+00 3.148 0.001663
## StyleFlanders Oud Bruin 1.000e+00 1.244e+01 0.080 0.935935
## StyleFlanders Red Ale -2.200e+01 1.244e+01 -1.769 0.077100
## StyleForeign / Export Stout 1.667e+01 8.293e+00 2.010 0.044577
## StyleFruit / Vegetable Beer -7.800e+00 7.327e+00 -1.065 0.287193
## StyleGerman Pilsener 1.218e+01 7.379e+00 1.650 0.099037
## StyleGose -1.257e+01 7.867e+00 -1.598 0.110201
## StyleGrisette 3.000e+00 1.244e+01 0.241 0.809447
## StyleHefeweizen -4.407e+00 7.359e+00 -0.599 0.549307
## StyleHerbed / Spiced Beer 2.000e-01 7.940e+00 0.025 0.979906
## StyleIrish Dry Stout 1.667e+01 8.498e+00 1.961 0.049965
## StyleIrish Red Ale 6.000e+00 7.757e+00 0.773 0.439333
## StyleKeller Bier / Zwickel Bier 2.333e+00 9.272e+00 0.252 0.801329
## StyleKölsch -2.222e-01 7.351e+00 -0.030 0.975886
## StyleKristalweizen -2.200e+01 1.244e+01 -1.769 0.077100
## StyleLight Lager -1.033e+01 7.757e+00 -1.332 0.182972
## StyleLow Alcohol Beer -2.200e+01 1.244e+01 -1.769 0.077100
## StyleMaibock / Helles Bock 5.500e+00 8.498e+00 0.647 0.517551
## StyleMärzen / Oktoberfest 9.048e-01 7.418e+00 0.122 0.902928
## StyleMead -2.200e+01 8.498e+00 -2.589 0.009689
## StyleMilk / Sweet Stout 3.167e+00 7.867e+00 0.403 0.687352
## StyleMunich Dunkel Lager -1.667e+00 8.796e+00 -0.189 0.849735
## StyleMunich Helles Lager -4.083e+00 7.533e+00 -0.542 0.587806
## Stylenone 2.000e+00 8.498e+00 0.235 0.813954
## StyleOatmeal Stout 6.091e+00 7.570e+00 0.805 0.421155
## StyleOld Ale 1.800e+01 1.016e+01 1.772 0.076492
## StyleOther -6.000e+00 1.244e+01 -0.482 0.629615
## StylePumpkin Ale 2.818e+00 7.488e+00 0.376 0.706673
## StyleQuadrupel (Quad) 2.000e+00 8.796e+00 0.227 0.820153
## StyleRadler -2.333e+00 9.272e+00 -0.252 0.801329
## StyleRauchbier -2.200e+01 1.016e+01 -2.166 0.030411
## StyleRoggenbier -2.000e+00 1.016e+01 -0.197 0.843914
## StyleRussian Imperial Stout 6.450e+01 7.808e+00 8.261 2.40e-16
## StyleRye Beer 3.000e+01 7.570e+00 3.963 7.63e-05
## StyleSaison / Farmhouse Ale 6.391e+00 7.319e+00 0.873 0.382604
## StyleSchwarzbier 9.000e+00 7.940e+00 1.134 0.257119
## StyleScotch Ale / Wee Heavy 2.231e+00 7.646e+00 0.292 0.770493
## StyleScottish Ale 4.909e+00 7.551e+00 0.650 0.515649
## StyleShandy -2.200e+01 9.272e+00 -2.373 0.017737
## StyleSmoked Beer 1.300e+01 1.244e+01 1.045 0.296107
## StyleTripel 1.500e+00 7.808e+00 0.192 0.847665
## StyleVienna Lager 2.357e+00 7.533e+00 0.313 0.754363
## StyleWheat Ale 2.000e+00 1.244e+01 0.161 0.872282
## StyleWinter Warmer 2.625e+00 7.646e+00 0.343 0.731384
## StyleWitbier -5.792e+00 7.321e+00 -0.791 0.428992
##
## (Intercept) **
## StyleAltbier
## StyleAmerican Adjunct Lager
## StyleAmerican Amber / Red Ale *
## StyleAmerican Amber / Red Lager
## StyleAmerican Barleywine ***
## StyleAmerican Black Ale ***
## StyleAmerican Blonde Ale
## StyleAmerican Brown Ale
## StyleAmerican Dark Wheat Ale
## StyleAmerican Double / Imperial IPA ***
## StyleAmerican Double / Imperial Pilsner ***
## StyleAmerican Double / Imperial Stout ***
## StyleAmerican India Pale Lager ***
## StyleAmerican IPA ***
## StyleAmerican Malt Liquor .
## StyleAmerican Pale Ale (APA) **
## StyleAmerican Pale Lager
## StyleAmerican Pale Wheat Ale
## StyleAmerican Pilsner
## StyleAmerican Porter
## StyleAmerican Stout **
## StyleAmerican Strong Ale ***
## StyleAmerican White IPA ***
## StyleAmerican Wild Ale
## StyleBaltic Porter ***
## StyleBelgian Dark Ale
## StyleBelgian IPA ***
## StyleBelgian Pale Ale
## StyleBelgian Strong Dark Ale ***
## StyleBelgian Strong Pale Ale
## StyleBerliner Weissbier .
## StyleBière de Garde
## StyleBock
## StyleBraggot .
## StyleCalifornia Common / Steam Beer *
## StyleChile Beer
## StyleCider **
## StyleCream Ale
## StyleCzech Pilsener
## StyleDoppelbock
## StyleDortmunder / Export Lager
## StyleDubbel
## StyleDunkelweizen
## StyleEnglish Barleywine ***
## StyleEnglish Bitter
## StyleEnglish Brown Ale
## StyleEnglish Dark Mild Ale
## StyleEnglish India Pale Ale (IPA) ***
## StyleEnglish Pale Ale
## StyleEnglish Pale Mild Ale
## StyleEnglish Stout ***
## StyleEnglish Strong Ale ***
## StyleEuro Dark Lager
## StyleEuro Pale Lager
## StyleExtra Special / Strong Bitter (ESB) **
## StyleFlanders Oud Bruin
## StyleFlanders Red Ale .
## StyleForeign / Export Stout *
## StyleFruit / Vegetable Beer
## StyleGerman Pilsener .
## StyleGose
## StyleGrisette
## StyleHefeweizen
## StyleHerbed / Spiced Beer
## StyleIrish Dry Stout *
## StyleIrish Red Ale
## StyleKeller Bier / Zwickel Bier
## StyleKölsch
## StyleKristalweizen .
## StyleLight Lager
## StyleLow Alcohol Beer .
## StyleMaibock / Helles Bock
## StyleMärzen / Oktoberfest
## StyleMead **
## StyleMilk / Sweet Stout
## StyleMunich Dunkel Lager
## StyleMunich Helles Lager
## Stylenone
## StyleOatmeal Stout
## StyleOld Ale .
## StyleOther
## StylePumpkin Ale
## StyleQuadrupel (Quad)
## StyleRadler
## StyleRauchbier *
## StyleRoggenbier
## StyleRussian Imperial Stout ***
## StyleRye Beer ***
## StyleSaison / Farmhouse Ale
## StyleSchwarzbier
## StyleScotch Ale / Wee Heavy
## StyleScottish Ale
## StyleShandy *
## StyleSmoked Beer
## StyleTripel
## StyleVienna Lager
## StyleWheat Ale
## StyleWinter Warmer
## StyleWitbier
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.16 on 2310 degrees of freedom
## Multiple R-squared: 0.8327, Adjusted R-squared: 0.8255
## F-statistic: 116.1 on 99 and 2310 DF, p-value: < 2.2e-16
#Look at diagnotic plots
# optional layout
layout(matrix(c(1,2,3,4),2,2))
# diagnostic plots
plot(ibustyleLM)
## Warning: not plotting observations with leverage one:
## 140, 178, 184, 956, 1538, 1880, 2024, 2322
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#############################################################################################################
#Conclusions for question 9
#Based on the pairs plot above, there is some visual evidence indicating that we can perhaps estlablish a relationship
#between beer style and IBU
#We used a linear regression model to determine if style is related to IBU and it turns out that there is one
#We checked model assumptions for lienar regression and this data, and the assumptions are met.
#As is turns out, you can predict beer style based on IBU
#############################################################################################################